Setup

Load Libraries

library(readxl)
library(DT)
library(data.table)
library(dplyr)
library(ggplot2)
library(plotly)
library(cowplot)
library(curl)
library(biomaRt)
library(sqldf) 
# *** susieR ****
# library(knitrBootstrap) #install_github('jimhester/knitrBootstrap')
library(susieR) # devtools::install_github("stephenslab/susieR") 

# *** finemapr ****
## finemapr contains: finemap, CAVIAR, and PAINTOR
#library(finemapr) # devtools::install_github("variani/finemapr") 
library(roxygen2)#roxygenize()  

# *** locuscomparer ****
# https://github.com/boxiangliu/locuscomparer
library(locuscomparer); #devtools::install_github("boxiangliu/locuscomparer")

# thm <- knitr::knit_theme$get("bipolar")
# knitr::knit_theme$set(thm)

sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14.2
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] locuscomparer_1.0.0 roxygen2_6.1.1      susieR_0.6.2.0411  
##  [4] sqldf_0.4-11        RSQLite_2.1.1       gsubfn_0.7         
##  [7] proto_1.0.0         biomaRt_2.38.0      curl_3.3           
## [10] cowplot_0.9.4       plotly_4.8.0        ggplot2_3.1.0      
## [13] dplyr_0.7.8         data.table_1.12.0   DT_0.5.1           
## [16] readxl_1.2.0       
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.0           lattice_0.20-38      tidyr_0.8.2         
##  [4] prettyunits_1.0.2    assertthat_0.2.0     digest_0.6.18       
##  [7] R6_2.3.0             cellranger_1.1.0     plyr_1.8.4          
## [10] chron_2.3-53         stats4_3.5.1         evaluate_0.12       
## [13] httr_1.4.0           pillar_1.3.1         rlang_0.3.1         
## [16] progress_1.2.0       lazyeval_0.2.1       blob_1.1.1          
## [19] S4Vectors_0.20.1     Matrix_1.2-15        rmarkdown_1.11      
## [22] stringr_1.3.1        htmlwidgets_1.3      RCurl_1.95-4.11     
## [25] bit_1.1-14           munsell_0.5.0        compiler_3.5.1      
## [28] xfun_0.4             pkgconfig_2.0.2      BiocGenerics_0.28.0 
## [31] htmltools_0.3.6      tcltk_3.5.1          tidyselect_0.2.5    
## [34] expm_0.999-3         tibble_2.0.1         IRanges_2.16.0      
## [37] XML_3.98-1.16        viridisLite_0.3.0    crayon_1.3.4        
## [40] withr_2.1.2          commonmark_1.7       bitops_1.0-6        
## [43] grid_3.5.1           jsonlite_1.6         gtable_0.2.0        
## [46] DBI_1.0.0            magrittr_1.5         scales_1.0.0        
## [49] stringi_1.2.4        bindrcpp_0.2.2       xml2_1.2.0          
## [52] tools_3.5.1          bit64_0.9-7          Biobase_2.42.0      
## [55] glue_1.3.0           purrr_0.3.0          hms_0.4.2           
## [58] parallel_3.5.1       yaml_2.2.0           AnnotationDbi_1.44.0
## [61] colorspace_1.4-0     memoise_1.1.0        knitr_1.21          
## [64] bindr_0.1.1
print(paste("susieR ", packageVersion("susieR")))
## [1] "susieR  0.6.2.411"

General Functions

createDT <- function(DF, caption="", scrollY=400){
  data <- DT::datatable(DF, caption=caption,
    extensions = 'Buttons',
    options = list( dom = 'Bfrtip', 
                    buttons = c('copy', 'csv', 'excel', 'pdf', 'print'), 
                    scrollY = scrollY, scrollX=T, scrollCollapse = T, paging = F,  
                      columnDefs = list(list(className = 'dt-center', targets = "_all"))
    )
  ) 
   return(data)
}
createDT_html <- function(DF, caption="", scrollY=400){
  print( htmltools::tagList( createDT(DF, caption, scrollY)) ) 
} 

Data Preprocessing

TSS Data

Use bioMart to get TSS positions for each gene

get_TSS_position <- function(gene){ 
  mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
  # att <- listAttributes(mart)
  # grep("transcription", att$name, value=TRUE)
  TSS <- getBM(mart=mart,
               attributes=c("hgnc_symbol","transcription_start_site", "version"), 
               filters="hgnc_symbol", values=gene)
} 

Import GWAS Summary Statistics

For each gene, get the position of top SNP (the one with the greatest effect size/Beta)

## Only the significant subset of results
Nalls_SS_sig <- read_excel("Data/Nalls2018_S2_SummaryStats.xlsx", sheet = "Data")
top_SNPs <- Nalls_SS_sig %>% group_by(`Nearest Gene`) %>% top_n(1, `Beta, all studies`) 
# lapply(Nalls_SS, function(x){print(paste(length(unique(x))))}) %>% data.frame() %>% t()
createDT(Nalls_SS_sig, caption = "Nalls (2018): Table S2. Detailed summary statistics on all nominated risk variants, known and novel")
# Nall (2018) QTL Stats
# Nalls_QTL <- read_excel("Data/Nalls2018_S4_QTL-MR.xlsx") 
# createDT(Nalls_QTL, caption = "Nalls (2018) Table S4. Expanded summary statistics for QTL Mendelian randomization") 

Assign SNPs to Gene

Nalls et al. (2019) data: https://github.com/neurogenetics/meta5

get_flanking_SNPs <- function(gene, top_SNPs, bp_distance=1000000){ 
  topSNP_sub <- subset(top_SNPs, `Nearest Gene`==gene)
  
  ## Full dataset  
  filePath <- "Data/META.PD.NALLS2014.PRS.tsv"# "Data/nallsEtal2019_no23andMe.tab.txt"
  # Get col names
  # strsplit( readLines(filePath, n = 2), "\t")   
  minPos <- topSNP_sub$BP - bp_distance
  maxPos <- topSNP_sub$BP + bp_distance
  
  geneSubset <- read.csv.sql(filePath, sep="\t", stringsAsFactors=F,
                           sql = paste('select * from file where CHR =',topSNP_sub$CHR,
                                       'AND POS BETWEEN', minPos, 'AND', maxPos)) 
  geneSubset <- geneSubset %>% dplyr::rename(SNP=MarkerName)
  return(geneSubset)
} 

susieR

  • Other fine-mapping tools included in susieR
  • susieR::N3finemapping.CAVIAR()
  • susieR::N3finemapping.DAP()
  • susieR::N3finemapping.FINEMAP()
  • Statistical Terms:
  • posterior inclusion probability (PIP)
  • coefficient estimate (Beta)
  • Effect allele frequency (EAF)
  • The I^2 statistic describes the percentage of variation across studies that seems not to be due to chance.

  • LDlink API
  • To get the LD matrices for each set of variants (without having to download and process an entire VCF file), use NIH’s LDLink API suite (specifically the LDmatrix tool) which uses data from the 1000 Genomes Project.
  • You first need to register to get your token here
  • Then specify which SNPs you want, and from which population(s)
  • A maximum of 300 SNPs are permitted by LDmatrix
  • All input variants must be on the same chromosome and match a bi-allelic variant

get_LDmatrix <- function(rs_list, token="df4298d58dc4", 
                         population_list=c("CEU","TSI","FIN","GBR","IBS"), 
                         R2_or_D="r2"){
  populations <- ifelse(length(population_list)==1, population_list, 
                        paste(population_list, collapse="%2B"))
  rs_IDs <- paste(rs_list, collapse="%0A")
  con <-curl(paste("https://ldlink.nci.nih.gov/LDlinkRest/ldmatrix?snps=", rs_IDs, 
                   "&pop=",populations,
                   "&r2_d=",R2_or_D,
                   "&token=",token, 
                   sep=""))
  open(con) 
  LD_matrix <- read.delim(con, header = T, row.names = "RS_number" )  
  LD_matrix <- LD_matrix %>% data.matrix(rownames.force = T)
  LD_matrix[is.na(LD_matrix)] <- 0 # | LD_matrix<0
   
  close(con)
  return(LD_matrix)
}


 
susie_on_gene <- function(gene, top_SNPs, SNP_limit=300){
  geneSubset <- get_flanking_SNPs(gene, top_SNPs) 
  geneSubset <- geneSubset%>% arrange(desc(abs(Effect)))
  geneSubset <- geneSubset[1:SNP_limit,]
  geneSubset$Coord <- paste(geneSubset$CHR,geneSubset$POS, sep="")
  #paste("chr",geneSubset$CHR,":",geneSubset$POS, sep="")
  # Subset summary stats 
  # cat("\n Preparing summary stats...\n")
  labelSNPs <- geneSubset[1:5,]
  
  before_plot <- ggplot(geneSubset, aes(x=Coord, y=Effect, label=SNP, color= -log(P.value) )) + 
    geom_hline(yintercept=0,alpha=.5, linetype=2, size=.5) +
    geom_point() + 
    geom_point(data=labelSNPs[1,], 
               pch=21, fill=NA, size=4, colour="red", stroke=1) +
    geom_segment(aes(xend=Coord, yend=0, color= -log(P.value)), alpha=.5) +
    geom_text(data=labelSNPs, aes(label=SNP), 
              vjust="inward",hjust="inward", nudge_y=.05, nudge_x=.25) +
    labs(title=paste(gene, "Before Fine Mapping",sep="\n"), y="Beta", x="BP Position",
         color="-log(p-value)\nAll Studies") +
    theme(plot.title = element_text(hjust = 0.5))
  # Get LD matrix from NIH API
  # cat("\n Downloading LD matrix...\n")
  LD_matrix <- get_LDmatrix(geneSubset$SNP )  
   
  # Run Susie
  # cat("\n Running SusieR...\n")  
  geneSubset_filt <- subset(geneSubset, SNP %in% row.names(LD_matrix)  )
  b <- geneSubset_filt$Effect
  se <- geneSubset_filt$StdErr
  
  fitted_bhat <- susie_bhat(bhat = b,
                            shat = se,
                            R = LD_matrix, 
                            n = nrow(LD_matrix), 
                            var_y = var(b),
                            L = 1, # Must lower to 1 if entering only 3 SNPs
                            scaled_prior_variance = 0.1, 
                            estimate_residual_variance = TRUE, 
                            estimate_prior_variance = FALSE, 
                            standardize = TRUE) 
  summary(fitted_bhat)$cs
  
  susieDF <- data.frame(SNP=names(fitted_bhat$X_column_scale_factors), PIP=fitted_bhat$pip) %>%
    base::merge(subset(geneSubset_filt, 
                                select=c("SNP","Effect","P.value", "Coord")), by="SNP") %>%
    arrange(desc(PIP))
  
  labelSNPs_PIP <- susieDF[1:5,]
  
  after_plot <- ggplot(susieDF, aes(x=Coord, y=PIP, label=SNP, color= -log(P.value) )) + 
    geom_hline(yintercept=0,alpha=.5, linetype=2, size=.5) +
    geom_point() +  
    geom_point(data=susieDF[susieDF$PIP == max(susieDF$PIP),], 
               pch=21, fill=NA, size=4, colour="green", stroke=1) +
    geom_segment(aes(xend=Coord, yend=0, color= -log(P.value))) +
    geom_text(data=labelSNPs_PIP, aes(label=SNP),
              vjust="inward",hjust="inward",nudge_x = .25) +
    labs(title=paste(gene, "After Fine Mapping",sep="\n"), y="PIP", x="BP Position",
         color="-log(p-value)\nAll Studies") +
    theme(plot.title = element_text(hjust = 0.5))#theme(legend.position="none") 
  
  plot_grid(before_plot, after_plot, nrow = 2) %>% print() 
  # susie_plot(fitted_bhat, y="PIP", b=b, add_bar = T) 
  
  createDT_html(susieDF, paste("susieR Results: ", gene), scrollY = 200)
  # createDT_html(LD_matrix, paste("LD matrix from 1000 Genomes: ", gene), scrollY = 200)
  return(susieDF)
}  

Fine Map with Summary Statistics

Only genes with at least 3 variants associated with them (from Nalls et al. (2019)) were included for fine mapping.

geneList <- c("LRRK2","GBAP1","SNCA","VPS13C","GCH1")#unique(top_SNPs$`Nearest Gene`) 
#Nalls_SS %>% group_by(`Nearest Gene`) %>% tally() %>% subset(n>2)

fineMapped_topSNPs <- data.table()

for (gene in geneList){ 
  cat("\n")
  cat("### ", gene, "\n")
  results <- susie_on_gene(gene, top_SNPs)
  fineMapped_topSNPs <- rbind(fineMapped_topSNPs, subset(results, PIP==max(PIP)) %>% as.data.table())
  cat("\n")
}

LRRK2

GBAP1

SNCA

VPS13C

GCH1

Top SNPs

createDT(fineMapped_topSNPs, "Potential Causal SNPs Identified by susieR", scrollY = 200)